Input




Preprocessing

Create gene-article matrix

### GET LIST OF GENES
list_gene_files = list.files('./../SFARI_scraping/genes/')


# GET LIST OF PUBMED IDS
get_pubmed_IDs = function(){
  all_pubmed_ids = c()
  for(gene_file in list_gene_files){
    file = read.delim(paste0('./../SFARI_scraping/genes/', gene_file))
    pubmed_ids = unique(file$pubmed.ID)
    all_pubmed_ids = unique(c(all_pubmed_ids, pubmed_ids)) 
  }
  all_pubmed_ids = as.character(all_pubmed_ids)
  return(all_pubmed_ids)
}

all_pubmed_ids = get_pubmed_IDs()


# CREATE GENE-ARTICLE (SPARSE) MATRIX
SFARI_genes_info = read.csv('./../../Data/SFARI_genes_01-15-2019.csv') %>% mutate('ID'=gene.symbol)

create_gene_pmID_mat = function(){
  
  # Create empty dataframe
  gene_pmID_mat = data.frame(matrix(0, nrow=length(list_gene_files), ncol=length(all_pubmed_ids)))
  rownames(gene_pmID_mat) = sapply(list_gene_files, function(x) gsub('.csv', '', x))
  colnames(gene_pmID_mat) = all_pubmed_ids
  
  # Fillout dataframe
  for(gene_file in list_gene_files){
    file = read.delim(paste0('./../SFARI_scraping/genes/', gene_file))
    pubmed_ids = as.character(unique(file$pubmed.ID))
    gene_pmID_mat[gsub('.csv', '', gene_file), pubmed_ids] = 1 
  }
  
  sparse_gene_pmID_mat = Matrix(as.matrix(gene_pmID_mat), sparse=TRUE)
  return(sparse_gene_pmID_mat)
}

gene_pmID_mat = create_gene_pmID_mat()

print(paste0('Genes: ', nrow(gene_pmID_mat), '     Articles: ', ncol(gene_pmID_mat)))
## [1] "Genes: 1053     Articles: 3709"
rm(list_gene_files)

Create article-Qualifier matrix

# LOAD AND PREPROCESS MESH TERMS
get_qualifiers = function(mesh_terms_by_article_list){
  
  qualifiers = c('abnormalities', 'administration & dosage', 'adverse effects', 'analogs & derivatives', 'analysis', 
                 'anatomy & histology', 'antagonists & inhibitors', 'biosynthesis', 'blood supply', 'cerebrospinal fluid',
                 'chemical synthesis', 'chemically induced', 'classification', 'complications', 'congenital',
                 'cytology', 'deficiency', 'diagnosis', 'diagnostic imaging', 'diet therapy', 'drug effects', 'drug therapy',
                 'economics', 'education', 'embryology', 'enzymology', 'epidemiology', 'ethics', 'ethnology', 'etiology',
                 'growth & development', 'history', 'immunology', 'injuries', 'innervation', 'instrumentation',
                 'isolation & purification', 'legislation & jurisprudence', 'manpower', 'metabolism', 'methods', 'microbiology',
                 'mortality', 'nursing', 'organization & administration', 'parasitology', 'pathology', 'pharmacokinetics',
                 'pharmacology', 'physiology', 'physiopathology', 'poisoning', 'prevention & control', 'psychology',
                 'radiation effects', 'radiotherapy', 'rehabilitation', 'secondary', 'secretion', 'standards',
                 'statistics & numerical data', 'supply & distribution', 'surgery', 'therapeutic use', 'therapy', 'toxicity',
                 'transmission', 'trends', 'ultrastructure', 'urine', 'utilization', 'veterinary', 'virology')
  
  qualifiers_by_article_list = list()
  
  for(article in names(mesh_terms_by_article_list)){
   
    mesh_terms = mesh_terms_by_article_list[[article]]
    article_qualifiers = c()
    
    if(!is.na(mesh_terms[1])){
      for(mesh_term in mesh_terms){
  
        # Look for qualifiers
        match_general = sapply(qualifiers, grepl, mesh_term)
        match_agonists = grepl('agonists', mesh_term) & !grepl('ntagonists', mesh_term)
        match_blood = grepl('blood', mesh_term) & !grepl('blood supply', mesh_term)
        match_chemistry = grepl('chemistry', mesh_term) & !grepl('immunohistochemistry', mesh_term)
        match_genetics = grepl('genetics', mesh_term) & !grepl('Xgenetics', mesh_term)
        
        # Add matches to the qualifiers list
        if(match_agonists) article_qualifiers = c(article_qualifiers, 'agonists')
        if(match_blood) article_qualifiers = c(article_qualifiers, 'blood')
        if(match_chemistry) article_qualifiers = c(article_qualifiers, 'chemistry')
        if(match_genetics) article_qualifiers = c(article_qualifiers, 'genetics')
        if(sum(match_general)>0) article_qualifiers = c(article_qualifiers, names(match_general)[match_general==TRUE])
        
      }
      # Update article entries
      qualifiers_by_article_list[[article]] = unique(article_qualifiers)
    }
    else qualifiers_by_article_list[[article]] = NA
    
  }
  return(qualifiers_by_article_list)
}

create_article_qualifier_mat = function(qualifiers_by_article_list){
  
  # GET LIST OF ARTICLES
  all_articles = names(qualifiers_by_article_list)
  
  # GET LIST OF QUALIFIERS
  all_qualifiers = c()
  for(mts in qualifiers_by_article_list) all_qualifiers = unique(c(all_qualifiers, mts))
  
  # CREATE ARTICLE-QUALIFIER MATRIX
  article_qualifier_mat = data.frame(matrix(0, nrow=length(all_articles), ncol=sum(!is.na(all_qualifiers))))
  rownames(article_qualifier_mat) = all_articles
  colnames(article_qualifier_mat) = all_qualifiers[!is.na(all_qualifiers)]
  
  for(article in all_articles){
    article_mesh_terms = qualifiers_by_article_list[[article]]
    if(sum(is.na(article_mesh_terms))==0){
      article_qualifier_mat[article, article_mesh_terms] = 1
    }
  } 
  
  article_qualifier_mat = Matrix(as.matrix(article_qualifier_mat), sparse=TRUE)
  
  return(article_qualifier_mat)
}

mesh_terms_by_article_list = read_json('./../Data/MeSH_terms_by_paper.json', simplifyVector=TRUE)
qualifiers_by_article_list = get_qualifiers(mesh_terms_by_article_list)
article_qualifier_mat = create_article_qualifier_mat(qualifiers_by_article_list)

print(paste0('Articles: ', nrow(article_qualifier_mat), '     Qualifiers: ', ncol(article_qualifier_mat)))
## [1] "Articles: 3707     Qualifiers: 65"

Create gene-qualifier matrix

gene_pmID_mat = gene_pmID_mat[,colnames(gene_pmID_mat) %in% rownames(article_qualifier_mat)]

# Check that ordering is the same in both matrices
if(!all(colnames(gene_pmID_mat) == rownames(article_qualifier_mat))){
  print("Article ordering doesn't match")
}

gene_qualifier_mat = gene_pmID_mat %*% article_qualifier_mat

print(paste0('Genes: ', nrow(gene_qualifier_mat), '     Qualifiers: ', ncol(gene_qualifier_mat)))
## [1] "Genes: 1053     Qualifiers: 65"

Remove genes without any associated Qualifier

to_keep = rowSums(gene_qualifier_mat)>0
print(paste0('Removing ', sum(!to_keep), ' genes'))
## [1] "Removing 14 genes"

SFARI score distribution of removed genes:

SFARI_genes_info$gene.score[!SFARI_genes_info$ID %in% rownames(gene_qualifier_mat)[to_keep]] %>% table(useNA='ifany')
## .
## 3 4 5 
## 3 6 5
# Remove genes
gene_qualifier_mat = gene_qualifier_mat[to_keep,]
rm(to_keep)

Analysis

genes_by_qualifier = data.frame('ID'=colnames(gene_qualifier_mat), 'n_genes'=colSums(gene_qualifier_mat))

ggplotly(genes_by_qualifier %>% ggplot(aes(reorder(ID, n_genes), n_genes, fill=sqrt(n_genes))) + geom_bar(stat='identity') + 
         ggtitle('No. Genes related to each Qualifier') + coord_flip() + scale_y_sqrt() +
         xlab('Qualifier') + ylab('No. Genes') + theme_minimal() + theme(legend.position = 'none'))
rm(genes_by_qualifier)

The higher the SFARI score, the more qualifiers it contains (makes sense because there was a relation between SFARI score and number of publications). The same relation exists between syndromic and non-syndromic genes.

qualifiers_by_gene = data.frame('ID'=rownames(gene_qualifier_mat), 'n_qualifiers'=rowSums(gene_qualifier_mat)) %>% 
                     left_join(SFARI_genes_info, by='ID') %>% dplyr::select(ID, n_qualifiers, gene.score, syndromic) %>%
                     mutate(gene.score = as.factor(gene.score), syndromic=as.factor(as.logical(syndromic)))

p1 = ggplotly(qualifiers_by_gene %>% ggplot(aes(gene.score, n_qualifiers, fill=gene.score)) + geom_boxplot() + 
         xlab('Gene Score') + ylab('No. Qualifiers') + theme_minimal() + theme(legend.position='none') +
         scale_fill_manual(values=SFARI_colour_hue()) + scale_color_manual(values=SFARI_colour_hue()))

p2 = ggplotly(qualifiers_by_gene %>% ggplot(aes(syndromic, n_qualifiers, fill=syndromic)) + geom_boxplot() + 
         ggtitle('Qualifiers related to each gene by score (left) and syndromic tag (right)') + 
         xlab('Syndromic') + ylab('No. Qualifiers') + theme_minimal() + theme(legend.position='none'))

subplot(p1, p2, nrows=1, widths=c(0.7,0.3))
rm(p1, p2)

Since scores 5 and 6 have no syndromic genes, this tag can be a confounder, but even when separating the genes both by score and syndromic tag, the relation persists

ggplotly(qualifiers_by_gene %>% ggplot(aes(gene.score, n_qualifiers, fill=gene.score)) + geom_boxplot() + 
         ggtitle('Qualifiers related to each gene by score and syndromic tag') + 
         xlab('Gene Score') + ylab('No. Qualifiers') + facet_wrap(~syndromic) +    
         scale_fill_manual(values=SFARI_colour_hue()) + scale_color_manual(values=SFARI_colour_hue()) +
         theme_minimal() + theme(legend.position='none'))
rm(qualifiers_by_gene)

Gene Network Analysis

Create Network

On average, each node is connected to half of the nodes -> strongly connected Network

# EDGES
gene_gene_mat = gene_qualifier_mat %*% t(gene_qualifier_mat)
gene_names = rownames(gene_qualifier_mat)

edges = summary(gene_gene_mat) %>% rename('from'=i, 'to'=j, 'count'=x) %>% 
        mutate(from = gene_names[from], to = gene_names[to]) %>%
        filter(from!=to)

# Convert count to fraction (multiplying by 100 to make the numbers bigger)
gene_qualifier_mat_rowsums = rowSums(gene_qualifier_mat)
names(gene_qualifier_mat_rowsums) = rownames(gene_gene_mat)
edges = edges %>% mutate('weight'=count/(gene_qualifier_mat_rowsums[from]+gene_qualifier_mat_rowsums[to]))

# NODES
nodes = SFARI_genes_info %>% dplyr::select(gene.symbol, gene.score, syndromic, number.of.reports) %>% 
        mutate(id = gene.symbol, title=gene.symbol, value=number.of.reports,
               shape=ifelse(syndromic==0, 'circle','square'),
               color_score=case_when(gene.score==1~'#FF7631', gene.score==2~'#FFB100',
                                     gene.score==3~'#E8E328', gene.score==4~'#8CC83F',
                                     gene.score==5~'#62CCA6', gene.score==6~'#59B9C9',
                                     is.na(gene.score) ~ '#b3b3b3')) %>%
        mutate('color'=color_score) %>% filter(!duplicated(id)) %>% 
        filter(gene.symbol %in% unique(c(edges$from, edges$to)))

# Details:
print(paste0('Nodes: ', nrow(nodes),'        Edges: ', nrow(edges)/2))
## [1] "Nodes: 1039        Edges: 537956"

Calculate eigencentrality by Gene Score

The higher the SFARI score, the higher the eigencentrality of the genes in the Network. Same with syndromic tag

graph = graph_from_data_frame(edges, vertices=nodes, directed=FALSE)
graph = graph %>% simplify(edge.attr.comb='first')

eigen_centrality_output = eigen_centrality(graph)
eigencentr = as.numeric(eigen_centrality_output$vector)
graph = graph %>% set_vertex_attr('eigencentrality', value=eigencentr)

plot_data = data.frame('gene.score'= as.factor(vertex_attr(graph, 'gene.score')),
                       'Syndromic'= as.logical(vertex_attr(graph, 'syndromic')),
                       'Eigencentrality'= vertex_attr(graph, 'eigencentrality'))

correlation_scr = cor(nodes$gene.score[!is.na(nodes$gene.score)], eigencentr[!is.na(nodes$gene.score)])
correlation_syn = cor(as.numeric(plot_data$Syndromic), plot_data$Eigencentrality)

p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, Eigencentrality, fill=gene.score)) + geom_boxplot() + 
         scale_fill_manual(values=SFARI_colour_hue()) + theme_minimal() + theme(legend.position='none'))

p2 = ggplotly(plot_data %>% ggplot(aes(Syndromic, Eigencentrality, fill=Syndromic)) + geom_boxplot() + 
     theme_minimal() + theme(legend.position='none') + 
     ggtitle(paste0('Correlation = ', round(correlation_scr, 2), ' (left) and ', round(correlation_syn,2), ' (right)')))

subplot(p1, p2, nrows=1, widths=c(0.7, 0.3))
rm(correlation_scr, correlation_syn, p1, p2)

When separating the SFARI scores by syndromic tag, the relation between eigencentrality and score persists

corr_non_syn = cor(nodes$gene.score[!is.na(nodes$gene.score) & nodes$syndromic==0], 
                   eigencentr[!is.na(nodes$gene.score) & nodes$syndromic==0])
corr_syn = cor(nodes$gene.score[!is.na(nodes$gene.score) & nodes$syndromic==1], 
               eigencentr[!is.na(nodes$gene.score) & nodes$syndromic==1])
ggplotly(plot_data %>% ggplot(aes(gene.score, Eigencentrality, fill=gene.score)) + geom_boxplot() + 
         scale_fill_manual(values=SFARI_colour_hue()) + theme_minimal() + theme(legend.position='none') +
         facet_wrap( ~ Syndromic, ncol=2) + ggtitle(paste0('Correlation = ', round(corr_non_syn, 4),
                                                           ' (left) and ', round(corr_syn, 4), ' (right)')))
rm(plot_data, corr_non_syn, corr_syn, nodes, edges)
plot_data = data.frame('id' = graph %>% get.vertex.attribute('name'), 'eigencentrality' = eigencentr) %>%
            top_n(25, wt=eigencentrality) %>% left_join(SFARI_genes_info, by=c('id'='ID'))
## Warning: Column `id`/`ID` joining factors with different levels, coercing
## to character vector
ggplotly(plot_data %>% ggplot(aes(reorder(id, eigencentrality), eigencentrality, fill=as.factor(gene.score))) + 
         geom_bar(stat='identity') + scale_fill_manual(values=SFARI_colour_hue(), na.value='#b3b3b3') + 
         coord_flip() + ggtitle(paste0('Top 25 genes with the highest Network centrality')) + 
         xlab('') + ylab('Eigencentrality') + theme_minimal() + theme(legend.position='none'))
rm(plot_data)

Note: Network is too densely connected to plot (too heavy and not very informative)

Community detection:

comms_louvain = cluster_louvain(graph)
modules = comms_louvain %>% membership

color_pal = viridis_pal()(max(modules)) %>% substr(1,7)
graph = graph %>% set_vertex_attr('module', value=as.numeric(modules)) %>% 
                  set_vertex_attr('color_modules', value=color_pal[as.numeric(modules)]) %>%
                  set_vertex_attr('color', value=color_pal[as.numeric(modules)])

comm_sizes = sizes(comms_louvain) %>% data.frame %>% rename(Module=Community.sizes, size=Freq)
ggplotly(comm_sizes %>% ggplot(aes(Module, size, fill=size)) + geom_bar(stat='identity') + 
         ggtitle('Number of genes in each module') + theme_minimal() + theme(legend.position = 'none'))
hm_data = table(vertex_attr(graph, 'gene.score'), vertex_attr(graph, 'module'), useNA='ifany') %>% 
          as.data.frame.matrix %>% as.matrix
heatmap.2(hm_data, cellnote=hm_data, notecol='white', Rowv=F, Colv=F, scale='row', trace='none', 
          key=FALSE, xlab='Module', ylab='SFARI score', col=colorRampPalette(brewer.pal(8, 'RdPu'))(25))

hm_data = table(vertex_attr(graph, 'syndromic'), vertex_attr(graph, 'module'), useNA='ifany') %>% 
          as.data.frame.matrix %>% as.matrix
heatmap.2(hm_data, cellnote=hm_data, notecol='white', Rowv=F, Colv=F, scale='row', trace='none', 
          key=FALSE, xlab='Module', ylab='Syndromic', col=colorRampPalette(brewer.pal(8, 'RdPu'))(25))

Separating by syndromic tag the SFARI scores

hm_data = data.frame('syndromic_gene.score'=paste(vertex_attr(graph, 'syndromic'), '_',vertex_attr(graph, 'gene.score')),
                     'module'=vertex_attr(graph, 'module'))
hm_data = table(hm_data$syndromic_gene.score, hm_data$module) %>% as.data.frame.matrix %>% as.matrix
heatmap.2(hm_data, cellnote=hm_data, notecol='white', scale='row', trace='none', 
          key=FALSE, xlab='Module', ylab='Syndromic tag + SFARI Score', col=colorRampPalette(brewer.pal(8, 'RdPu'))(25))

rm(color_pal, hm_data)

Module characterisation

Most important genes for each module coloured by SFARI score

module_info = tibble('module'=character(), 'top_term'=character(), size=integer())

for(i in names(sizes(comms_louvain))){
  
  module_vertices = names(modules)[modules==i]
  
  # Creat subgraph
  module_graph = induced_subgraph(graph, module_vertices)
  
  # Calculate eigencentrality
  eigen_centrality_output = eigen_centrality(module_graph)
  eigencentr = as.numeric(eigen_centrality_output$vector)
  names(eigencentr) = module_vertices
  
  module_info = module_info %>% add_row('module'=i, 'top_term'=names(eigencentr)[eigencentr==max(eigencentr)][1], 
                                        'size'=length(module_vertices))
    
  # Plot eigencentrality of top 10 genes
  plot_data = data.frame('gene'=names(eigencentr), 'eigencentrality'=eigencentr) %>%
              top_n(10, wt=eigencentrality) %>% left_join(SFARI_genes_info, by=c('gene'='ID'))
  print(plot_data %>% ggplot(aes(reorder(gene, eigencentrality), eigencentrality, fill=as.factor(gene.score))) + 
       geom_bar(stat='identity') + scale_fill_manual(values=SFARI_colour_hue(), na.value='#b3b3b3') + coord_flip() + 
       ggtitle(paste0('Top genes for Module ', i)) + xlab('Genes') + ylab('Eigencentrality') + 
       theme_minimal() + theme(legend.position='none'))

}

rm(i, module_vertices, module_graph, eigen_centrality_output, eigencentr, plot_data)

Most important Qualifiers for each module

for(i in names(sizes(comms_louvain))){
  
  module_genes = names(modules)[modules==i]
  
  # Filter genes belonging to module and column with non-zero values
  module_gene_qualifier_mat = gene_qualifier_mat[rownames(gene_qualifier_mat) %in% module_genes,]
  module_gene_qualifier_mat = module_gene_qualifier_mat[,colSums(module_gene_qualifier_mat)>0]
  
  # Do PCA and extract 1st PC
  pca_module = prcomp(module_gene_qualifier_mat, scale.=TRUE)
  module_eigengene = pca_module$rotation[,'PC1']
  names(module_eigengene) = colnames(module_gene_qualifier_mat)
  
  # Plot module's most relevant qualifiers
  plot_data = data.frame('qualifier'=names(module_eigengene), 'eigencentrality'=module_eigengene) %>%
              top_n(25, wt=abs(eigencentrality))
  print(plot_data %>% ggplot(aes(reorder(qualifier, abs(eigencentrality)), eigencentrality, fill=abs(eigencentrality))) + 
       geom_bar(position='identity', stat='identity') + scale_fill_viridis_c() + coord_flip() + 
       ggtitle(paste0('Top Qualifiers for Module ', i)) + 
       xlab('Qualifiers') + ylab('Eigencentrality') + theme_minimal() + theme(legend.position='none'))
  
}

rm(i, module_genes, module_gene_qualifier_mat, pca_module, module_eigengene, plot_data)